#### Appendices 4.3 and 4.4: Descriptive Statistics on Supervised Learning Models ####

source("./code/loadPackages.R")

# Load in the results of predictions
# NOTE(review): nothing in this script defines coefs, outTestActual, testPreds,
# resampTest, resampTest3, resampTrain, resampTrain3, errors, errors3, fpls, or act,
# so they are presumably restored into the workspace by this .RData file --
# confirm against the script that saves it.
load("./data/predictHawkMetrics.RData")

#### Table A6: Distribution of Estimated Coefficients across 1,000 Bootstrapped Models of Hawkishness ####
# Stack every column of `coefs` into long (name, value) pairs so each coefficient
# can be summarized over all of its rows.
# NOTE(review): presumably one row per bootstrap iteration and one column per
# coefficient -- confirm against the script that builds `coefs`.
coefsLong = coefs %>% pivot_longer(cols=everything())

# Per coefficient: the mean plus the 2.5th and 97.5th percentiles, ignoring NAs.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that can
# be reassigned elsewhere in the session.
coefStats = coefsLong %>%
  group_by(name) %>%
  summarize(lb=quantile(value, 0.025, na.rm=TRUE),
            mean=mean(value, na.rm=TRUE),
            ub=quantile(value, 0.975, na.rm=TRUE))

# Export the main table (LaTeX, three decimals, no row names)
print(xtable(coefStats, digits=3), include.rownames=FALSE)


#### Figure A4: Actual Versus Predicted Values of Hawkishness for FPLP Respondents ####
# Collapse both sources to one row per respondent id, then join them on id
actualHawk = rbindlist(outTestActual) # All the actual MI scores in the FPLS
actualHawk = actualHawk %>% group_by(id) %>% summarize(actual=mean(actual))
testPredsSum = testPreds %>% group_by(id) %>% summarize(meanPred=mean(pred))
testPerf = merge(testPredsSum, actualHawk, by="id")
# Positive avgError = the mean prediction exceeds the actual score
testPerf$avgError = testPerf$meanPred - testPerf$actual

# Long format: one row per (respondent, series), series being "meanPred" or "actual"
predBox = testPerf %>% pivot_longer(c("meanPred", "actual"))

# Axis labels are keyed by category name rather than by position, so they cannot
# be silently swapped if the ordering of the categories ever changes.
predBoxPlot = ggplot(predBox, aes(name, value)) + geom_boxplot() + coord_flip() + theme_bw() +
  ylab("Hawkishness") +
  scale_x_discrete("", labels=c(actual="Actual", meanPred="Prediction"))
predBoxPlot
# Pass the plot explicitly instead of relying on whichever plot was drawn last
ggsave("./figures/predVsActualBox.pdf", plot=predBoxPlot, width=4.25, height=2)
# Predictions are compressed on the spectrum: they do not reach hawkishness values
# as extreme as the actual scores.

#### Figure A5: Actual Hawkishness Versus Prediction Error for FPLP Respondents ####
# Scatter of per-respondent prediction error against actual hawkishness, with a
# dashed zero-error reference line and a linear fit drawn on top.
errorPlot = ggplot(testPerf, aes(x=actual, y=avgError)) +
  geom_hline(yintercept=0, linetype=2) +
  geom_point(color="gray40", alpha=0.2, size=0.7) +
  geom_smooth(method="lm") +
  labs(x="Hawkishness", y="Average Prediction Error") +
  theme_bw()
errorPlot
ggsave("./figures/hawkVsError.jpg", plot=errorPlot, width=4, height=3)
# Errors grow as respondents' hawkishness or dovishness becomes more extreme:
#   - for very hawkish FPLS respondents, the model tends to underestimate hawkishness;
#   - for very dovish FPLS respondents, the model tends to overestimate hawkishness.


#### Table A7: Out-of-Sample Performance Metrics for Two Linear Models ####
# Column-wise means of each resampling results object (MARGIN=2 means "by column").
# For the training objects only the first three columns are averaged.
apply(resampTest, MARGIN=2, FUN=mean)           # Boosted model, out-of-sample
apply(resampTest3, MARGIN=2, FUN=mean)          # OLS, out-of-sample
apply(resampTrain[,1:3], MARGIN=2, FUN=mean)    # Boosted model
apply(resampTrain3[,1:3], MARGIN=2, FUN=mean)   # OLS

# Standard deviation of the absolute errors (recorded values in parentheses)
sd(x=abs(errors))     # Boosted model (0.1149)
sd(x=abs(errors3))    # OLS (0.1153)


#### Table A8: Comparison of Mean Values across Decision-Maker Dataset and the FPLP Survey ####
# Share of FPLP respondents in each birth-decade category
prop.table(table(fpls$birthDecade))

# Make separate covariates for birth decades.
# A single map from dummy name to birthDecade code is applied to both datasets,
# so the two sets of indicators cannot drift apart.
birthDecadeCodes = c(bornPre1910=0, born1910s=1, born1920s=2, born1930s=3, bornPost1940=4)

# Returns `df` with one 0/1 indicator column per entry of birthDecadeCodes.
# Rows where birthDecade is NA get NA in each indicator.
addBirthDecadeDummies = function(df) {
  for (dummy in names(birthDecadeCodes)) {
    df[[dummy]] = ifelse(df$birthDecade==birthDecadeCodes[[dummy]], 1, 0)
  }
  df
}

fpls = addBirthDecadeDummies(fpls)
act = addBirthDecadeDummies(act)

# Define the variables to compare
predVars = c("male", "bornPre1910", "born1910s", "born1920s", "born1930s", "bornPost1940", "CollegeGrad", "MA", "MBA", "LLBJD", "MD", 
             "PhD", "MilService", "wwii", "korea", "vietnam", "MilOfficer", "FSO", "Rep", "Dem")

# Extract summary statistics for these variables from the FPLP survey:
# reshape to long (name, value) pairs, then take the mean per variable
fplsLong = fpls %>% dplyr::select(all_of(predVars)) %>% pivot_longer(everything())
fplsStats = fplsLong %>% group_by(name) %>% summarize(mean=mean(value))

# Extract summary statistics for these variables from our actors
actLong = as.data.frame(act) %>% dplyr::select(all_of(predVars)) %>% pivot_longer(everything())
actStats = actLong %>% group_by(name) %>% summarize(mean=mean(value))

# Produce table of comparisons.
# The two summaries are paired by variable name rather than by row position, so a
# difference in row order can never put one variable's mean beside another's.
fplsRow = match(actStats$name, fplsStats$name)
stopifnot(!anyNA(fplsRow))  # every actor variable must have an FPLP counterpart
compTable = xtable(data.frame(var=actStats$name,
                              actMean=actStats$mean,
                              fplsMean=fplsStats$mean[fplsRow]),
                   digits=3)
print(compTable, include.rownames=F)